@@ -4,7 +4,6 @@ require 'date'
 
 module Agents
   class WebsiteAgent < Agent
-    cannot_receive_events!
 
     default_schedule "every_12h"
 
@@ -46,6 +45,8 @@ module Agents
       Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
 
       Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
+
+      The WebsiteAgent can also scrape based on incoming events. It will scrape the URL contained in the `url` key of the incoming event payload.
     MD
 
     event_description do
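
Concretely, the contract the new doc line describes is just an event whose payload carries a `url` key. Something like the following from an upstream agent (payload contents illustrative, not from the patch) would trigger a scrape, with the agent's existing `extract` options applied to the fetched page:

    # Hypothetical payload emitted by an upstream agent:
    { 'url' => 'http://xkcd.com' }
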
@@ -105,19 +106,23 @@ module Agents
     end
 
     def check
-      hydra = Typhoeus::Hydra.new
       log "Fetching #{options['url']}"
+      check_url options['url']
+    end
+
+    def check_url(in_url)
+      hydra = Typhoeus::Hydra.new
       request_opts = { :followlocation => true }
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
 
       requests = []
 
-      if options['url'].kind_of?(Array)
-        options['url'].each do |url|
+      if in_url.kind_of?(Array)
+        in_url.each do |url|
           requests.push(Typhoeus::Request.new(url, request_opts))
         end
       else
-        requests.push(Typhoeus::Request.new(options['url'], request_opts))
+        requests.push(Typhoeus::Request.new(in_url, request_opts))
       end
 
       requests.each do |request|
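
After this hunk, `check` is a thin wrapper and the old body lives in `check_url`, which still accepts either a single URL string or an array of URLs, exactly as `options['url']` could before. A minimal sketch of the two call shapes (receiver name assumed):

    # What `check` now does internally with options['url']:
    agent.check_url('http://xkcd.com')

    # The Array branch is preserved for multi-URL configurations:
    agent.check_url(['http://xkcd.com', 'http://example.com/feed'])
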
@@ -185,7 +190,7 @@ module Agents
             options['extract'].keys.each do |name|
               result[name] = output[name][index]
               if name.to_s == 'url'
-                result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
+                result[name] = URI.join(request.base_url, result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
               end
             end
 
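
This is the subtle half of the change: a relative extracted `url` used to be resolved against the configured `options['url']`, which breaks once one agent fetches many pages (URL arrays, and now incoming events). `request.base_url` is the URL this particular Typhoeus request actually fetched, so each relative link resolves against its own page. The `URI.join` behavior being relied on:

    require 'uri'

    # Relative path resolved against the page it was scraped from:
    URI.join('http://xkcd.com/', '/about').to_s
    # => "http://xkcd.com/about"

    # Resolving against the wrong base yields a plausible but wrong link:
    URI.join('http://example.com/feed', '/about').to_s
    # => "http://example.com/about"
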
@@ -202,6 +207,13 @@ module Agents
       end
     end
 
+    def receive(incoming_events)
+      incoming_events.each do |event|
+        url_to_scrape = Utils.value_at(event['payload'], 'url')
+        check_url(url_to_scrape)
+      end
+    end
+
     private
 
     # This method returns true if the result should be stored as a new event.
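
`Utils.value_at` is Huginn's payload lookup helper (it resolves a path expression rather than doing a bare hash access, if I'm reading the helper right). The flow this method wires up, roughly (names illustrative):

    # Sketch of the receive flow, assuming a configured WebsiteAgent:
    event = Event.new
    event.payload = { 'url' => 'http://xkcd.com' }

    website_agent.receive([event])
    # -> Utils.value_at(payload, 'url')  # => "http://xkcd.com"
    # -> check_url("http://xkcd.com")    # normal fetch + extract pipeline

Note this only works because `cannot_receive_events!` was dropped in the first hunk. The remaining hunks are the matching spec changes.
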
@@ -275,5 +287,7 @@ module Agents
         false
       end
     end
+
   end
+
 end
@@ -331,6 +331,19 @@ describe Agents::WebsiteAgent do
         end
       end
     end
+
+    describe "#receive" do
+      it "should scrape from the url element in incoming event payload" do
+        @event = Event.new
+        @event.agent = agents(:bob_rain_notifier_agent)
+        @event.payload = { 'url' => "http://xkcd.com" }
+
+        lambda {
+          @checker.options = @site
+          @checker.receive([@event])
+        }.should change { Event.count }.by(1)
+      end
+    end
   end
 
   describe "checking with http basic auth" do
@@ -361,4 +374,4 @@ describe Agents::WebsiteAgent do
       end
     end
   end
-end
+end